#!/usr/bin/env python
# -*-coding:utf-8-*-
#============================================================#
#WOE

#ref_table by chimerge rules
#apply ref_table

#============================================================#
#system arguments
from optparse import OptionParser
import contingency as cy
from IPython.display import display_html
from itertools import chain

usage = '''
NOTE:
1. Get WOE reference table for infile.
'''

import os
import sys
import math
import copy
import operator
import pandas as pd
import scipy.stats
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were removed later; pick whichever name this matplotlib
# installation provides and keep the default style if neither exists.
for _style_name in ('seaborn-v0_8-white', 'seaborn-white'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break



# locate bins for numeric variables


def merge_samll_bins(key,mapping):
    """Return the merged-bin label for ``key``.

    Args:
        key: original category value.
        mapping: container mapping original values to merged labels.

    Returns:
        ``mapping[key]`` when ``key`` is present in ``mapping``, otherwise
        ``str(key)``.
    """
    return mapping[key] if key in mapping else str(key)


# calculate pivot table
def pivot(df,var,tgt):
    """Cross-tabulate ``var`` against the binary target and add the target rate.

    Args:
        df: DataFrame containing the columns ``var`` and ``tgt``.
        var: name of the variable column (becomes the row index).
        tgt: name of the target column; its values 0 and 1 become columns.

    Returns:
        DataFrame indexed by the values of ``var`` with one count column per
        target value plus ``'tgt_rt'`` = count(1) / (count(0) + count(1)).
        Combinations that never occur are NaN.
    """
    # size() counts rows per (var, tgt) pair. count() cannot be used here:
    # with both columns consumed as group keys there is no value column left.
    counts = df.groupby([var, tgt]).size().unstack()
    counts['tgt_rt'] = counts[1] / (counts[0] + counts[1])
    return counts

# locate bins for numeric variables
def bin_loc(value,uvbucket):
    """Find the ``(lower, upper]`` bucket that contains ``value``.

    Args:
        value: number to locate.
        uvbucket: indexable sequence of bucket edges.

    Returns:
        A ``(lower, upper)`` tuple of adjacent edges. A value equal to the
        smallest edge is assigned to the first bucket. When no bucket matches
        (or fewer than two edges are given) an empty numpy array is returned.
    """
    not_found = np.empty(0)
    edge_count = len(uvbucket)
    if edge_count < 2:
        return not_found

    # The minimum edge always maps to the first bucket, overriding any match.
    if value == np.min(uvbucket):
        return (uvbucket[0], uvbucket[1])

    located = not_found
    for pos in range(edge_count - 1):
        lower, upper = uvbucket[pos], uvbucket[pos + 1]
        if value > lower and value <= upper:
            located = (lower, upper)
    return located

def cag_woe_calc(var_bin_dict,value):
    """Return the key of ``var_bin_dict`` whose member collection contains ``value``.

    When several keys contain ``value`` the last one in iteration order wins.
    NOTE(review): if no key contains ``value`` the result name is never bound
    and an UnboundLocalError is raised - confirm callers never hit that case.
    """
    for bin_key, members in var_bin_dict.items():
        if value in members:
            matched_key = bin_key
    return matched_key

def bin_calc(var_bin_dict,value):
    """Join the strings stored under ``value`` in ``var_bin_dict`` with ``'_'``."""
    pieces = var_bin_dict[value]
    return '_'.join(pieces)

# calculate woe
def woe_calc(bad,good,goodfreq,badfreq):
    """Compute the weight of evidence of one bin.

    Args:
        bad: number of target (1) rows in the bin.
        good: number of non-target (0) rows in the bin.
        goodfreq: total number of non-target rows (note: passed before badfreq).
        badfreq: total number of target rows.

    Returns:
        ``log((bad / badfreq) / (good / goodfreq))`` for a bin holding both
        classes, ``-99999999.0`` when the bin's target share is zero and
        ``99999999.0`` when the bin holds target rows only.
    """
    bad_cnt = float(bad)
    target_rt = bad_cnt / float(badfreq)
    good_cnt = float(good)
    non_target_rt = good_cnt / float(goodfreq)

    has_both_classes = bad_cnt != 0.0 and bad_cnt / (bad_cnt + good_cnt) != 1.0
    if has_both_classes:
        woe = math.log(float(target_rt / non_target_rt))
    elif target_rt == 0.0:
        # no target rows in this bin: sentinel instead of log(0)
        woe = -99999999.0
    elif bad_cnt / (bad_cnt + good_cnt) == 1.0:
        # only target rows in this bin: sentinel instead of dividing by zero
        woe = 99999999.0
    return woe

def woe_calc_base(bad,good):
    """Return the WOE of the whole population (the ``'base'`` entry).

    Each class is divided by its own total, so the result is ``log(1 / 1)``,
    i.e. ``0.0``, whenever both totals are non-zero numbers.

    Args:
        bad: total number of target rows.
        good: total number of non-target rows.

    Returns:
        ``0.0`` for non-zero totals, or the sentinel ``-999999`` when a total
        is zero or cannot be interpreted as a number.
    """
    try:
        # Convert to plain floats first: numpy scalars divide by zero without
        # raising (they yield NaN), which would bypass the sentinel below.
        bad_total = float(bad)
        good_total = float(good)
        woe = math.log((bad_total / bad_total) / (good_total / good_total))
    except (ArithmeticError, TypeError, ValueError):
        woe = -999999
    return woe


def iv_calc(df,var):
    """Compute the information value from a per-bin count table.

    Args:
        df: DataFrame with a column ``0`` (non-target counts) and a column
            ``1`` (target counts), one row per bin. The per-bin contribution
            is stored in place in the new column ``var + '_iv_bin'``.
        var: prefix used for the name of the contribution column.

    Returns:
        Sum over bins of ``(bad_dist - good_dist) * log(bad_dist / good_dist)``.
    """
    target_share = df[1] / df[1].sum()
    non_target_share = df[0] / df[0].sum()
    # A zero target share is replaced by a small constant so the log is defined.
    # NOTE(review): a zero non-target share is not replaced - confirm intended.
    target_share = target_share.mask(target_share == 0, 0.0001)
    log_ratio = (target_share / non_target_share).apply(math.log)
    contribution = (target_share - non_target_share) * log_ratio
    df[var +'_iv_bin'] = contribution
    return contribution.sum()
    
    
#cvlookup function
def cvlookup(table,key):
    """Look up the reference value of a categorical ``key``.

    Args:
        table: reference table mapping category -> value, with a ``'base'``
            entry used as fallback.
        key: category to look up.

    Returns:
        ``float(table[key])``; the ``'base'`` value when the stored entry is
        the string ``'-99999999.0'``; ``0.0`` when ``key`` is not in ``table``.
    """
    if key not in table:
        return float(0)
    found = table[key]
    if found == '-99999999.0':
        # sentinel entry: fall back to the population-level value
        found = table['base']
    return float(found)


#nvlookup function
#def nvlookup(table,value):
#
#    keylist = table.keys()
#    keylist.sort()
#
#    kmaxrange = keylist[-2]
#    kminrange = keylist[0]
#    kmax = keylist[-2].split('_')
#    kmin = keylist[0].split('_')
#
#    value = round(value,8)
#        
#    for key in table.keys():
#
#        if key != 'base':
#            krange = key.split('_')
#            
#            if value >=  float(krange[0]) and value < float(krange[1]):
#
#                if table[key] == '-99999999.0':
#                    ref = 0
#                else:
#                    ref = table[key]
#                break    
#            elif value >= float(kmax[1]):
#                ref = table[kmaxrange]
#            elif value < float(kmin[1]):  
#                ref = table[kminrange]
#            
#    return ref


def nvlookup(table,value):
    """Look up the reference value of the numeric bin that contains ``value``.

    Args:
        table: reference table whose keys are ``'<lower>_<upper>'`` strings.
            The key that sorts last is skipped when the bin edges are parsed
            (presumably the ``'base'`` entry - verify against callers).
        value: number to look up.

    Returns:
        The entry of the first bin with ``lower <= value < upper``. Values
        below the smallest lower edge use the first bin, any other unmatched
        value uses the last bin.
    """
    bin_keys = sorted(table.keys())[:-1]
    lower_edges = sorted(float(name.split('_')[0]) for name in bin_keys)
    upper_edges = sorted(float(name.split('_')[1]) for name in bin_keys)

    hit = None
    for pos, (lower, upper) in enumerate(zip(lower_edges, upper_edges)):
        if lower <= value and upper > value:
            hit = pos
            break

    if hit is None:
        # out of range: clamp to the first or the last bin
        hit = 0 if value < lower_edges[0] else -1

    # keys are rebuilt from the parsed floats, so they must round-trip via str()
    return table[str(lower_edges[hit])+'_'+str(upper_edges[hit])]

def str_convert(code):
    """Convert a numeric code to the string of its integer part.

    Args:
        code: value to convert.

    Returns:
        ``str(int(code))`` when ``code`` is exactly an ``int``, ``float`` or
        ``numpy.float64``; every other value (strings included) is returned
        unchanged.
    """
    # Exact type match on purpose: bool and other numeric subclasses pass
    # through untouched, as before. The former second ``str`` branch could
    # never run and has been removed.
    if type(code) in (int, float, np.float64):
        return str(int(code))
    return code


# calculate time delta
def time_delta(d1,d2):
    """Return the difference ``d1 - d2`` in months.

    Both arguments are split as ``int(d / 100)`` (years) and the remainder
    (months), i.e. they are treated as numbers of the form YYYYMM.
    """
    years_1 = int(d1/100)
    years_2 = int(d2/100)
    months_1 = d1 - years_1 * 100
    months_2 = d2 - years_2 * 100
    return 12 * (years_1 - years_2) + months_1 - months_2



def main_calc_numeric_ref_table(df,var,tgt,max_bins,var_char_loc,bins,to_plot = True):
    """Build a WOE reference table for a numeric variable without merging bins.

    Args:
        df: data containing the columns ``var`` and ``tgt``; target values 0
            and 1 are used as column labels. A ``var + '_bin'`` column is
            added to ``df`` in place.
        var: name of the numeric variable.
        tgt: name of the target column.
        max_bins: when ``var`` has fewer distinct values than this, the
            distinct values are the cut edges; otherwise the edges are the
            ``i / max_bins`` quantiles.
        var_char_loc: path prefix for the saved figure (``var_char_loc + var``).
        bins: explicit cut edges; when non-empty they replace the computed ones.
        to_plot: draw, save and show the count / WOE chart.

    Returns:
        ``(ref_table, iv)``: a dict mapping ``'<lower>_<upper>'`` to the bin
        WOE plus a ``'base'`` entry, and the information value.
    """
    # ------- 1. Initialize: create the numeric bins -------
    distinct_values = df[var].unique()
    if len(distinct_values) < max_bins:
        uvbucket = np.sort(distinct_values)
        # NOTE: the printed threshold is fixed text; the real one is max_bins.
        print('  unique value less than 1000')
    else:
        uvbucket = np.empty(0)
        for i in np.arange(max_bins+1):
            try:
                uvbucket = np.unique(np.append(uvbucket,df[var].quantile(float(i)/float(max_bins))))
            except Exception:
                # best effort: a quantile that cannot be computed is skipped
                pass
        print('  unique value greater than 1000')

    edges = uvbucket if len(bins) == 0 else bins
    intervals = pd.cut(df[var],edges,retbins=True,include_lowest = True)[0]
    # Each bin label is rendered as text, its brackets are stripped and both
    # edges are parsed back to floats. str() is required because pd.cut may
    # yield Interval objects rather than strings.
    df[var+'_bin'] = [tuple([float(j) for j in str(i).strip('([]').split(',')]) for i in np.array(intervals)]

    ds = df.groupby([var+'_bin',tgt]).count().unstack()[var].fillna(value=0)
    ds['bin'] = [[str(i[0]),str(i[1])] for i in list(ds.index)]

    # ------- 2. get the reference table -------
    ds = ds.reset_index(drop=True)

    ds['ref_table'] = None
    goodfreq = ds[0].sum()
    badfreq = ds[1].sum()
    ds[var +'_woe'] = ds.apply(lambda x: woe_calc(x[1],x[0],goodfreq,badfreq), axis = 1)
    ds['ref_table'] = ds['bin'].apply(lambda x: x[0] + '_' + x[1])

    print(ds)
    iv = iv_calc(ds,var +'_woe')
    if to_plot:
        plt.bar(list(range(len(ds[1].values))), ds[1].values + ds[0].values)
        plt.bar(list(range(len(ds[1].values))), ds[1].values,color='r')
        ax2 = plt.twinx()
        x = ds.index + 0.5
        y = ds[var +'_woe'].values
        plt.plot(np.arange(len(x))+0.5, y, '.-k', linewidth=2, markersize=10)
        my_xticks = ds['bin'].values
        plt.xticks(x, my_xticks)
        for i,j in zip(np.arange(len(x))+0.5,y):
            ax2.annotate(str(round(j,2)),xy=(i,j),va="center", ha="left",
                      bbox=dict(boxstyle="round", fc="w"))
        # Title and save before show(): showing first can release the figure,
        # leaving savefig with an empty canvas.
        plt.title('{}:{}'.format(var,iv))
        plt.savefig(var_char_loc + var)
        plt.show()
        plt.close()

    print(('  IV: {}'.format(iv)))

    ref_table = dict(list(zip(ds['ref_table'],ds[var +'_woe'])))
    ref_table['base'] = woe_calc_base(ds[1].sum(),ds[0].sum())

    return ref_table,iv

# 数值型WOE
# main function: get the reference table for numeric variables
def main_get_numeric_ref_table(df,var,tgt,max_bins,var_char_loc,dfins=None,drop_na=False,to_plot = True):
    """Build the WOE reference table of a numeric variable with chi-square bin merging.

    Steps:
        1. Cut ``df[var]`` into initial bins (midpoints between the distinct
           values, or between the ``i / max_bins`` quantiles).
        2. Merge the adjacent pair of bins with the smallest chi-square
           statistic while more than 6 bins remain or the smallest statistic
           is at or below the 95% chi-square quantile with 1 d.o.f.
        3. Merge bins that hold fewer than 1/20 of all rows into a neighbour.
        4. Compute WOE, IV, target rate and row count per final bin.

    Args:
        df: training data with the columns ``var`` and ``tgt``; target values
            0 and 1 are used as column labels. A ``var + '_bin'`` column is
            added (to a reduced copy when ``drop_na`` is true, otherwise to
            ``df`` itself).
        var: name of the numeric variable.
        tgt: name of the target column.
        max_bins: maximum number of initial bins.
        var_char_loc: path prefix of the saved figure
            (``var_char_loc + var + '.jpeg'``).
        dfins: optional second sample (labelled "Test" in the chart); it is
            binned with the final edges and plotted next to the training
            sample. A ``var + '_bin'`` column is added to it in place.
        drop_na: drop rows with a missing ``var`` or ``tgt`` first.
        to_plot: draw and save the chart.

    Returns:
        ``(ref_table, iv)``: a DataFrame with the columns ``Var_Value``,
        ``Ref_Value``, ``Target_Rate`` and ``Cnt`` (one row per bin plus a
        final ``'base'`` row) and the information value of the training sample.
    """
    # ------- 1. Initialize: create the numeric bins -------
    if drop_na:
        df = df[[var,tgt]].dropna(how='any').reset_index(drop=True).copy()

    uin_value = df[var].unique()
    if len(uin_value) < max_bins:
        uvalue = np.sort(uin_value)
        # NOTE: the printed threshold is fixed text; the real one is max_bins.
        size_msg = '  unique value less than 1000'
    else:
        uvalue = np.empty(0)
        for i in np.arange(max_bins+1):
            try:
                uvalue = np.unique((np.append(uvalue,df[var].quantile(float(i)/float(max_bins)))))
            except Exception:
                # best effort: a quantile that cannot be computed is skipped
                pass
        size_msg = '  unique value greater than 1000'
    # Edges: the minimum, the midpoints between neighbouring values, the maximum.
    uvdiff = np.append(np.diff(uvalue)/2,0)
    uvbucket = np.unique(np.append(uvalue.min(),uvalue + uvdiff))
    print(size_msg)

    # Each bin label is rendered as text, its brackets are stripped and both
    # edges are parsed back to floats.
    df[var+'_bin'] = [tuple([float(j) for j in str(i).strip('([]').split(',')]) for i in np.array(pd.cut(df[var],uvbucket,retbins=True,include_lowest = True)[0])]

    ds = df.groupby([var+'_bin',tgt]).count().unstack()[var].fillna(value=0)

    ds['bin'] = [[str(i[0]),str(i[1])] for i in list(ds.index)]
    ds['bin_lb'] = [str(i[0]) for i in list(ds.index)]

    ds = ds.reset_index(drop = True)
    # chisq[i] compares bin i with bin i+1; the last bin gets a large sentinel
    # so it is never picked as the left side of a merge.
    chisq = []
    for i in range(ds.shape[0]-1):
        chisq.append(round(cy.chi2_contingency(ds.iloc[i:i+2,][[0,1]])[0],11))
    chisq.append(9999999.0)
    ds['chisq'] = chisq

    # ------- 2. chimerge: merge the adjacent bins -------
    # Timer started once so the printed duration covers the whole merge phase.
    start_time = datetime.now()
    while ds.shape[0] > 6 or ds.chisq.min() <= scipy.stats.chi2.ppf(0.95, 1):
        # merge bin k+1 into bin k, where k has the smallest chi-square statistic
        k = ds['chisq'].idxmin()
        ds.iloc[k,0:2] = ds.iloc[k,0:2] + ds.iloc[k+1,0:2]
        ds['bin'].iat[k] = [ds['bin'].iat[k][0],ds['bin'].iat[k+1][1]]
        ds.drop(k+1,inplace=True)
        ds.reset_index(drop = True,inplace=True)
        # only the statistics touching the merged bin need refreshing
        if k != 0:
            ds['chisq'].iat[k-1] = cy.chi2_contingency(ds.iloc[k-1:k+1,0:2])[0]
        if k < ds.shape[0] - 1:
            ds['chisq'].iat[k] = cy.chi2_contingency(ds.iloc[k:k+2,0:2])[0]
        else:
            ds['chisq'].iat[k] = 9999999.0
    end_time = datetime.now()

    print(('  Duration of merge bins by chisq rules: {}'.format(end_time - start_time)))
    print(("  shape of the reduced table: {}".format(ds.shape)))

    # ------- 3. chimerge: control bin size -------
    pop_cut = (ds[0].sum() + ds[1].sum())/20
    ds['pop'] = ds[0] + ds[1]
    while ds['pop'].min() < pop_cut:
        # recompute the chi-square statistic of every adjacent pair
        chisq = []
        for i in range(ds.shape[0]-1):
            chisq.append(cy.chi2_contingency(ds.iloc[i:i+2,][[0,1]])[0])
        chisq.append(9999999.0)
        ds['chisq'] = chisq

        # The smallest bin is merged with its right neighbour, or with its
        # left neighbour when it is the last bin or the left pair has the
        # smaller statistic. For k == 0, iat[k-1] reads the sentinel of the
        # last row, so k stays 0.
        k = ds['pop'].idxmin()
        if k == ds.shape[0] - 1 :
            k -= 1
        elif ds['chisq'].iat[k] > ds['chisq'].iat[k-1]:
            k -= 1

        ds.iloc[k,0:2] = ds.iloc[k,0:2] + ds.iloc[k+1,0:2]
        ds['bin'].iat[k] = [ds['bin'].iat[k][0],ds['bin'].iat[k+1][1]]
        ds['bin_lb'].iat[k] = ds['bin'].iat[k][0]
        ds.drop(k+1,inplace=True)
        ds['pop'] = ds[0] + ds[1]
        ds.reset_index(drop=True,inplace=True)

    print(("  shape of the final reduced table: {}".format(ds.shape)))

    # ------- 4. get the reference table -------
    ds['ref_table'] = None
    goodfreq = ds[0].sum()
    badfreq = ds[1].sum()
    ds[var +'_woe'] = ds.apply(lambda x: woe_calc(x[1],x[0],goodfreq,badfreq), axis = 1)
    ds['ref_table'] = ds['bin'].apply(lambda x: x[0] + '_' + x[1])
    iv = iv_calc(ds,var +'_woe')
    ds['Target_Rate'] = ds[1] / (ds[0] + ds[1])
    ds['Cnt'] = ds[0] + ds[1]

    if dfins is not None:
        # Reuse the final edges, stretching the outer ones to cover dfins.
        bins = sorted(list(set([float(x) for x in list(chain.from_iterable(ds['bin'].tolist()))])))
        if bins[-1] < dfins[var].max():
            bins.pop()
            bins.append(dfins[var].max())

        if bins[0] > dfins[var].min():
            bins.pop(0)
            bins = [dfins[var].min()] + bins
        dfins[var + '_bin'] = [tuple([float(j) for j in str(i).strip('([]').split(',')]) for i in
                            np.array(pd.cut(dfins[var], bins, retbins=True, include_lowest=True)[0])]
        dss = dfins.groupby([var + '_bin', tgt]).count().unstack()[var].fillna(value=0)
        dss['bin'] = [[str(i[0]), str(i[1])] for i in list(dss.index)]
        dss['bin_lb'] = [str(i[0]) for i in list(dss.index)]
        dss = dss.reset_index(drop=True)
        tgoodfreq = dss[0].sum()
        tbadfreq = dss[1].sum()
        dss[var + '_woe'] = dss.apply(lambda x: woe_calc(x[1], x[0], tgoodfreq, tbadfreq), axis=1)
        iv_test = iv_calc(dss, var + '_woe')
        dss['Target_Rate'] = dss[1] / (dss[0] + dss[1])

    if to_plot:
        total_cnt = ds[1].values.sum() + ds[0].values.sum()
        barylim = total_cnt + total_cnt/7
        minval_woe = ds[var +'_woe'].values.min()
        maxval_woe = ds[var +'_woe'].values.max()
        minval_trt = ds['Target_Rate'].values.min()
        maxval_trt = ds['Target_Rate'].values.max()
        if dfins is not None:
            # Shared y-range of both panels: smallest minimum, largest maximum.
            minval_woe = min([minval_woe, dss[var +'_woe'].values.min()])
            maxval_woe = max([maxval_woe, dss[var +'_woe'].values.max()])
            minval_trt = min([minval_trt, dss['Target_Rate'].values.min()])
            maxval_trt = max([maxval_trt, dss['Target_Rate'].values.max()])
        minval = min([minval_woe, minval_trt])
        maxval = max([maxval_woe, maxval_trt])
        # pad the line axis by half of the extreme value, away from the data
        if minval < 0:
            lineylim_min = minval + minval / 2
        else:
            lineylim_min = minval - minval / 2
        if maxval > 0:
            lineylim_max = maxval + maxval / 2
        else:
            lineylim_max = maxval - maxval / 2

        def draw_panel(panel, title):
            # Draw one panel: stacked count bars plus WOE and target-rate lines.
            # Positions and tick labels always come from the training bins.
            positions = list(range(len(ds[1].values)))
            plt.bar(positions, panel[1].values + panel[0].values,color='#86A4DA')
            plt.bar(positions, panel[1].values, color='#EAAA79')
            plt.ylim((0, barylim))

            ax2 = plt.twinx()
            x = ds.index + 0.5
            centers = np.arange(len(x)) + 0.5
            woe_values = panel[var +'_woe'].values
            rate_values = panel['Target_Rate'].values
            plt.plot(centers, woe_values, '.-k', linewidth=2, markersize=10, color='#7AB7B5')
            plt.plot(centers, rate_values, '.-k', linewidth=2, markersize=10, color='#8C8C8C')
            plt.ylim((lineylim_min, lineylim_max))
            plt.xticks(x, ds['bin'].values)

            for values in (woe_values, rate_values):
                for i, j in zip(centers, values):
                    ax2.annotate(str(round(j, 4)), xy=(i, j), va="center", ha="left",
                                 bbox=dict(boxstyle="round", fc="w"))
            plt.title(title)

        if dfins is not None:
            # second sample on the right, training sample on the left
            plt.figure(figsize=(18,7))
            plt.subplot(122)
            draw_panel(dss, 'Test-Sp-{}  IV:{}'.format(var, round(iv_test, 6)))
            plt.subplot(121)
        draw_panel(ds, 'Train-Sp-{}  IV:{}'.format(var,round(iv,6)))
        plt.savefig(var_char_loc + var+'.jpeg')
        plt.close()

    ref_table = ds[['ref_table', var +'_woe', 'Target_Rate', 'Cnt']].rename(columns={'ref_table': 'Var_Value',
                                                                               var + '_woe': 'Ref_Value'
                                                                                      }).copy()
    # population-level row appended after the per-bin rows
    total = pd.DataFrame({'Var_Value': ['base'],
                          'Ref_Value': [0.0],
                          'Target_Rate': [ds[1].sum()/ (ds[0] + ds[1]).sum()],
                          'Cnt': [ds['Cnt'].sum()]
                          })

    ref_table = pd.concat([ref_table, total[['Var_Value','Ref_Value','Target_Rate','Cnt']]])
    print(('  IV: {}'.format(iv)))

    return ref_table,iv


def main_get_numeric_ref_table_by_bins(df,var,tgt,bins, var_char_loc='',dfins=None,drop_na=False,to_plot = True):
    """Build the WOE reference table of a numeric variable for fixed bin edges.

    Args:
        df: training data with the columns ``var`` and ``tgt``; target values
            0 and 1 are used as column labels. A ``var + '_bin'`` column is
            added (to a reduced copy when ``drop_na`` is true, otherwise to
            ``df`` itself).
        var: name of the numeric variable.
        tgt: name of the target column.
        bins: cut edges handed to ``pd.cut`` (``include_lowest=True``).
        var_char_loc: path prefix of the saved figure
            (``var_char_loc + var + '.jpeg'``).
        dfins: optional second sample (labelled "Test" in the chart); it is
            binned with the same edges, stretched to its own min / max, and
            plotted next to the training sample. A ``var + '_bin'`` column is
            added to it in place.
        drop_na: drop rows with a missing ``var`` or ``tgt`` first.
        to_plot: draw and save the chart.

    Returns:
        ``(ref_table, iv)``: a DataFrame with the columns ``Var_Value``,
        ``Ref_Value``, ``Target_Rate`` and ``Cnt`` (one row per bin plus a
        final ``'base'`` row) and the information value of the training sample.
    """
    start_time = datetime.now()

    # ------- 1. Initialize: create the numeric bins -------
    if drop_na:
        df = df[[var,tgt]].dropna(how='any').reset_index(drop=True).copy()

    # Each bin label is rendered as text, its brackets are stripped and both
    # edges are parsed back to floats.
    df[var+'_bin'] = [tuple([float(j) for j in str(i).strip('([]').split(',')]) for i in np.array(pd.cut(df[var],bins,retbins=True,include_lowest = True)[0])]

    ds = df.groupby([var+'_bin',tgt]).count().unstack()[var].fillna(value=0)

    ds['bin'] = [[str(i[0]),str(i[1])] for i in list(ds.index)]
    ds['bin_lb'] = [str(i[0]) for i in list(ds.index)]

    ds = ds.reset_index(drop = True)

    # No merging happens for fixed bins; both messages are kept for log parity.
    print(("  shape of the reduced table: {}".format(ds.shape)))
    print(("  shape of the final reduced table: {}".format(ds.shape)))

    # ------- 2. get the reference table -------
    ds['ref_table'] = None
    goodfreq = ds[0].sum()
    badfreq = ds[1].sum()
    ds[var +'_woe'] = ds.apply(lambda x: woe_calc(x[1],x[0],goodfreq,badfreq), axis = 1)
    ds['ref_table'] = ds['bin'].apply(lambda x: x[0] + '_' + x[1])
    iv = iv_calc(ds,var +'_woe')

    ds['Target_Rate'] = ds[1] / (ds[0] + ds[1])
    ds['Cnt'] = ds[0] + ds[1]

    if dfins is not None:
        # Reuse the training edges, stretching the outer ones to cover dfins.
        bins = sorted(list(set([float(x) for x in list(chain.from_iterable(ds['bin'].tolist()))])))
        if bins[-1] < dfins[var].max():
            bins.pop()
            bins.append(dfins[var].max())

        if bins[0] > dfins[var].min():
            bins.pop(0)
            bins = [dfins[var].min()] + bins
        dfins[var + '_bin'] = [tuple([float(j) for j in str(i).strip('([]').split(',')]) for i in
                            np.array(pd.cut(dfins[var], bins, retbins=True, include_lowest=True)[0])]
        dss = dfins.groupby([var + '_bin', tgt]).count().unstack()[var].fillna(value=0)
        dss['bin'] = [[str(i[0]), str(i[1])] for i in list(dss.index)]
        dss['bin_lb'] = [str(i[0]) for i in list(dss.index)]
        dss = dss.reset_index(drop=True)
        tgoodfreq = dss[0].sum()
        tbadfreq = dss[1].sum()
        dss[var + '_woe'] = dss.apply(lambda x: woe_calc(x[1], x[0], tgoodfreq, tbadfreq), axis=1)
        iv_test = iv_calc(dss, var + '_woe')
        dss['Target_Rate'] = dss[1] / (dss[0] + dss[1])

    if to_plot:
        total_cnt = ds[1].values.sum() + ds[0].values.sum()
        barylim = total_cnt + total_cnt/7
        minval_woe = ds[var +'_woe'].values.min()
        maxval_woe = ds[var +'_woe'].values.max()
        minval_trt = ds['Target_Rate'].values.min()
        maxval_trt = ds['Target_Rate'].values.max()
        if dfins is not None:
            # Shared y-range of both panels: smallest minimum, largest maximum.
            minval_woe = min([minval_woe, dss[var +'_woe'].values.min()])
            maxval_woe = max([maxval_woe, dss[var +'_woe'].values.max()])
            minval_trt = min([minval_trt, dss['Target_Rate'].values.min()])
            maxval_trt = max([maxval_trt, dss['Target_Rate'].values.max()])
        minval = min([minval_woe, minval_trt])
        maxval = max([maxval_woe, maxval_trt])
        # pad the line axis by half of the extreme value, away from the data
        if minval < 0:
            lineylim_min = minval + minval / 2
        else:
            lineylim_min = minval - minval / 2
        if maxval > 0:
            lineylim_max = maxval + maxval / 2
        else:
            lineylim_max = maxval - maxval / 2

        def draw_panel(panel, title):
            # Draw one panel: stacked count bars plus WOE and target-rate lines.
            # Positions and tick labels always come from the training bins.
            positions = list(range(len(ds[1].values)))
            plt.bar(positions, panel[1].values + panel[0].values,color='#86A4DA')
            plt.bar(positions, panel[1].values, color='#EAAA79')
            plt.ylim((0, barylim))

            ax2 = plt.twinx()
            x = ds.index + 0.5
            centers = np.arange(len(x)) + 0.5
            woe_values = panel[var +'_woe'].values
            rate_values = panel['Target_Rate'].values
            plt.plot(centers, woe_values, '.-k', linewidth=2, markersize=10, color='#7AB7B5')
            plt.plot(centers, rate_values, '.-k', linewidth=2, markersize=10, color='#8C8C8C')
            plt.ylim((lineylim_min, lineylim_max))
            plt.xticks(x, ds['bin'].values)

            for values in (woe_values, rate_values):
                for i, j in zip(centers, values):
                    ax2.annotate(str(round(j, 4)), xy=(i, j), va="center", ha="left",
                                 bbox=dict(boxstyle="round", fc="w"))
            plt.title(title)

        if dfins is not None:
            # second sample on the right, training sample on the left
            plt.figure(figsize=(18,7))
            plt.subplot(122)
            draw_panel(dss, 'Test-Sp-{}  IV:{}'.format(var, round(iv_test, 6)))
            plt.subplot(121)
        draw_panel(ds, 'Train-Sp-{}  IV:{}'.format(var,round(iv,6)))
        plt.savefig(var_char_loc + var+'.jpeg')
        plt.close()

    ref_table = ds[['ref_table', var +'_woe', 'Target_Rate', 'Cnt']].rename(columns={'ref_table': 'Var_Value',
                                                                               var + '_woe': 'Ref_Value'
                                                                                      }).copy()
    # population-level row appended after the per-bin rows
    total = pd.DataFrame({'Var_Value': ['base'],
                          'Ref_Value': [0.0],
                          'Target_Rate': [ds[1].sum()/ (ds[0] + ds[1]).sum()],
                          'Cnt': [ds['Cnt'].sum()]
                          })

    ref_table = pd.concat([ref_table, total[['Var_Value','Ref_Value','Target_Rate','Cnt']]])
    print(('  IV: {}'.format(iv)))
    end_time = datetime.now()
    print('  Duration of getting the reference table: {}'.format(end_time - start_time))
    return ref_table,iv


# 计算数值型变量的WOE 根据固定bins值
def calcu_num_woe_iv_by_bins(df_var,var_list,tgt,bins,var_char_loc='',dfins=None,drop_na=1,to_plot=0):
    '''
    Compute WOE / IV of numeric variables for caller-supplied bin edges.

    df_var: feature data, including the target column.
    var_list: list of the column names to process.
    tgt: name of the target column.
    bins: bin edges. With exactly one variable in var_list this is one flat
          sequence of edges; with several variables it is one sequence of
          edges per variable, in the same order as var_list. Edges are
          extended with the column minimum / maximum when they do not cover
          them.
    var_char_loc: path prefix for saved figures; defaults to '' (current
          directory).
    dfins: optional second sample; when given, its WOE and target rate are
          plotted for the same bins. Defaults to None.
    drop_na: whether rows with missing values are dropped; default 1 (yes).
    to_plot: whether charts are drawn; 0: no (default), 1: yes.

    Returns the concatenated reference tables of all variables with the extra
    columns 'Var_Name' and 'IV', sorted by 'IV' in descending order.
    '''
    df_numeric_ref_table = pd.DataFrame()

    # Normalise to one edge sequence per variable so a single loop serves both
    # calling conventions.
    if len(var_list) == 1:
        bins_per_var = [bins]
    else:
        bins_per_var = bins

    for var, bin_cid in zip(var_list,bins_per_var):
        print('\n#============ Start to process on the {} ============#'.format(var))
        # Work on a list copy: list concatenation below would otherwise turn
        # into element-wise addition for numpy arrays and fail for tuples.
        bin_cid = list(bin_cid)
        # extend the edges downwards to the smallest observed value
        bin_min = df_var[var].min()
        if bin_min < bin_cid[0]:
            bin_cid = [bin_min] + bin_cid
        # extend the edges upwards to the largest observed value
        max_bin = df_var[var].max()
        if max_bin > bin_cid[-1]:
            bin_cid =  bin_cid + [max_bin]
        df_ref_table_tmp,iv = main_get_numeric_ref_table_by_bins(df_var,var,tgt,bin_cid,var_char_loc,dfins,drop_na,to_plot)
        df_ref_table_tmp['Var_Name'] = var
        df_ref_table_tmp['IV'] = iv
        df_numeric_ref_table = pd.concat((df_numeric_ref_table,df_ref_table_tmp),axis = 0)
        display_html(df_ref_table_tmp)

    return df_numeric_ref_table.sort_values('IV',ascending=False)


# 计算数值型变量的WOE 根据卡方检验
def calcu_num_woe_iv(df_var,var_list,tgt,max_bins=1000,var_char_loc='',dfins=None,drop_na=1,to_plot=0):
    '''
    Compute WOE / IV of numeric variables with chi-square based binning.

    df_var: feature data, including the target column.
    var_list: list of the column names to process.
    tgt: name of the target column.
    max_bins: maximum number of initial bins; default 1000.
    var_char_loc: path prefix for saved figures; defaults to '' (current
          directory).
    dfins: optional second sample; when given, its WOE and target rate are
          plotted for the same bins. Defaults to None.
    drop_na: whether rows with missing values are dropped; default 1 (yes).
    to_plot: whether charts are drawn; 0: no (default), 1: yes.

    Returns the concatenated reference tables of all variables with the extra
    columns 'Var_Name' and 'IV', sorted by 'IV' in descending order.
    '''
    combined = pd.DataFrame()
    for name in var_list:
        print(f'\n#============ Start to process on the {name} ============#')
        var_table, var_iv = main_get_numeric_ref_table(
            df_var, name, tgt, max_bins, var_char_loc, dfins, drop_na, to_plot)
        # tag every row with the variable it belongs to and that variable's IV
        var_table['Var_Name'] = name
        var_table['IV'] = var_iv
        combined = pd.concat((combined, var_table), axis=0)
        display_html(var_table)

    ranked = combined.sort_values('IV', ascending=False)
    return ranked


# main function: get the reference table for categorical variables
def main_get_cat_ref_table(datain,var,tgt,to_plot = True):
    '''
    Compute one WOE value per distinct category of ``var`` (no bin merging).

    datain: data containing ``var`` and ``tgt``;
    var: name of the categorical column;
    tgt: name of the target column; the code reads the target levels 0 and 1;
    to_plot: when truthy, draw count bars and the WOE line on the current
             matplotlib figure (the figure is neither saved nor closed here).

    Returns a dict mapping str(category) -> str(woe), plus the key 'base'
    holding woe_calc_base(total of target 1, total of target 0).
    '''
    print(("  shape of the master table: {}".format(datain.shape)))    
    print("\n  #--------------- 1. master load successfully --------------#") 

    # Row counts per (category, target). size() does not depend on which
    # column happens to be first in datain, nor on NaNs in unrelated columns.
    df = datain.groupby([var,tgt]).size().unstack().fillna(value=0)

    df['pop'] = df[1] + df[0]
    ds = df.sort_values(['pop'],ascending=True)
    ds['bin'] = [str(i) for i in ds.index]

    #------- get the reference table -------#
    # Placeholder column; it is part of the frame handed to iv_calc below.
    ds['ref_table'] = None
    goodfreq = ds[0].sum()  
    badfreq = ds[1].sum()
    ds[var +'_woe'] = ds.apply(lambda x: woe_calc(x[1],x[0],goodfreq,badfreq), axis = 1)

    if to_plot:
        bar_pos = list(range(len(ds[1].values)))
        plt.bar(bar_pos, ds[1].values + ds[0].values)
        plt.bar(bar_pos, ds[1].values,color='r')
        ax2 = plt.twinx()
        y = ds[var +'_woe'].values
        line_pos = np.arange(len(y))+0.5
        plt.plot(line_pos, y, '.-k', linewidth=2, markersize=10)
        for i,j in zip(line_pos,y):
            ax2.annotate(str(round(j,2)),xy=(i,j),va="center", ha="left",
                      bbox=dict(boxstyle="round", fc="w"))

    iv = iv_calc(ds,var +'_woe')
    print(('  IV: {}'.format(iv)))

    # One entry per category, in the row order of ds.
    ref_table = {}
    for bin_name, woe in zip(ds['bin'].values, ds[var+'_woe'].values):
        ref_table[bin_name] = str(woe)
    ref_table['base'] = woe_calc_base(ds[1].sum(),ds[0].sum())
    print("\n  #--------------- get the reference table --------------#")    

    return ref_table


#  Categorical WOE
# main function: get the combined reference table for categorical variables
def main_get_comb_cat_ref_table(df,var,tgt,var_char_loc='',drop_na=1,to_plot = True):
    '''
    Build a WOE reference table for one categorical variable, merging
    categories with a ChiMerge-style procedure.

    Steps, as implemented below:
      1. count rows per (category, target) pair;
      2. pool categories whose size is below total/1200000 into one bin;
      3. order bins by target rate and repeatedly merge the adjacent pair with
         the smallest chi-square while more than 6 bins remain, stopping early
         once every pair exceeds the 95% chi-square quantile (1 d.o.f.);
      4. merge bins holding less than 1/20 of the population into a neighbour;
      5. compute WOE per bin and the IV.

    df: data containing ``var`` and ``tgt``; it is not modified;
    var: name of the categorical column;
    tgt: name of the target column; the code reads the target levels 0 and 1;
    var_char_loc: prefix of the saved chart path (var_char_loc + var + '.jpeg');
    drop_na: when truthy, rows with a missing ``var`` or ``tgt`` are dropped first;
    to_plot: when truthy, the chart is drawn, saved and closed.

    Returns (ref_table, iv, cwoe_ref_table):
      ref_table: dict str(category) -> str(woe), plus 'base' -> 0;
      iv: value returned by iv_calc;
      cwoe_ref_table: DataFrame with columns Var_Value, Ref_Value,
                      Target_Rate, Cnt, one row per bin plus a 'base' row.

    NOTE(review): merged bin names are joined with '_' and later split on '_',
    so a category whose text itself contains '_' is split into pieces in
    ref_table - confirm whether such categories occur.
    '''
    print(("  shape of the master table: {}".format(df.shape)))    

    # Work on a private two-column copy so the caller's frame is never touched.
    df = df[[var,tgt]].copy()
    if drop_na:
        df = df.dropna(how='any').reset_index(drop=True)
    start_time = datetime.now()
    # Row counts per (category, target). size() works even though no value
    # column is left once the frame is reduced to [var, tgt].
    ds = df.groupby([var,tgt]).size().unstack().fillna(value=0)
    ds = ds.reset_index(drop = False)

    # population size per category
    ds['pop'] = ds[1] + ds[0]
    ds = ds.sort_values(['pop'],ascending=True)

    #------- merge the bins with small size -------#
    ubd = (ds['pop'].sum())/1200000.0

    ds[var + '_comb'] = ds[var]
    if ds['pop'].min() < ubd:
        # The lookup must be keyed by category values (not row labels), since
        # merge_samll_bins is applied to the raw values of df[var].
        small_values = list(ds.loc[ds['pop'] < ubd, var])
        ds_idx_comb = "_".join([str(i) for i in small_values])
        ds_idx_comb_dict = {j: ds_idx_comb for j in small_values}
        df[var + '_comb'] = df[var].apply(lambda x: merge_samll_bins(x,ds_idx_comb_dict)) 

        ds = df.groupby([var + '_comb',tgt]).size().unstack().fillna(value=0)
        ds['pop'] = ds[1] + ds[0]
        ds = ds.sort_values(['pop'],ascending=True)
        ds = ds.reset_index(drop = False)

    # From here on the column layout is [<category>, 0, 1, 'pop', ...];
    # the positional slices 1:3 below rely on it.
    ds['target_rt'] = ds[1]/(ds[1] + ds[0])
    ds = ds.sort_values(['target_rt'],ascending=True)
    # chi-square between each bin and its successor; the last bin has no
    # successor and gets a large sentinel so it is never picked as minimum.
    chisq = []
    for i in range(ds.shape[0]-1):
        chisq.append(cy.chi2_contingency(ds.iloc[i:i+2][[0,1]])[0])
    chisq.append(9999999.0)   
    ds['chisq'] = chisq
    
    ds['bin'] = ds[var + '_comb'].apply(lambda x: str_convert(x))
    end_time = datetime.now()
    print(("  shape of the grouped table: {}".format(ds.shape)))
    print(('  Duration of group process: {}'.format(end_time - start_time)))

    #------- chimerge: the adjacent bins -------#
    # After each reset_index the row label equals the row position, so the
    # label returned by idxmin() can be used for both .at and .iloc.
    ds.reset_index(drop=True,inplace=True)
    start_time = datetime.now()
    chisq_cut = scipy.stats.chi2.ppf(0.95, 1)
    while ds.shape[0] > 6:
        k = ds['chisq'].idxmin()
        # merge the adjacent bins, drop the second bin
        ds.iloc[k,1:3] = ds.iloc[k,1:3] + ds.iloc[k+1,1:3]
        ds.at[k,'bin'] = str(ds.at[k,'bin']) + '_' + str(ds.at[k+1,'bin'])
        ds.drop(k+1,inplace=True)
        ds.reset_index(drop = True,inplace=True)
        # refresh the chi-square of the two pairs that involve the merged bin
        if k != 0:
            ds.at[k-1,'chisq'] = cy.chi2_contingency(ds.iloc[k-1:k+1,1:3])[0]
        if k < ds.shape[0] - 1:
            ds.at[k,'chisq'] = cy.chi2_contingency(ds.iloc[k:k+2,1:3])[0]
        else:
            ds.at[k,'chisq'] = 9999999.0
        if ds['chisq'].min() > chisq_cut:
            break
    end_time = datetime.now()
    
    print(('  Duration of merge bins by chisq rules: {}'.format(end_time - start_time)))  
    print(("  shape of the reduced table: {}".format(ds.shape)))    
    
    #------- chimerge: control bin size -------#
    pop_cut = float(ds[0].sum() + ds[1].sum())/20

    ds['pop'] = ds[0] + ds[1]

    while ds['pop'].min() < pop_cut:
        k = ds['pop'].idxmin()
        if k == ds.shape[0] - 1 :
            # last bin: merge it into its left neighbour
            k -= 1
        elif k > 0 and ds.at[k,'chisq'] > ds.at[k-1,'chisq']:
            # left neighbour is the closer one; the k > 0 guard keeps the
            # first bin from being compared with (and merged into) the last
            k -= 1
        ds.iloc[k,1:3] = ds.iloc[k,1:3] + ds.iloc[k+1,1:3]
        ds.at[k,'bin'] = str(ds.at[k,'bin']) + '_' + str(ds.at[k+1,'bin'])
        ds.drop(k+1,inplace=True)
        ds.reset_index(drop = True,inplace=True)

        ds['pop'] = ds[0] + ds[1]
    
    print(("  shape of the reduced table: {}".format(ds.shape)))
             
    #------- get the reference table -------#
    # Placeholder column; it is part of the frame handed to iv_calc below and
    # is not included in the returned table.
    ds['ref_table'] = None
    goodfreq = ds[0].sum()  
    badfreq = ds[1].sum()
    ds[var +'_woe'] = ds.apply(lambda x: woe_calc(x[1],x[0],goodfreq,badfreq), axis = 1)
    iv = iv_calc(ds, var + '_woe')
    print(('  IV: {}'.format(iv)))
    ds['Target_Rate'] = ds[1] / (ds[0] + ds[1])
    ds['Cnt'] = ds[0] + ds[1]
    if to_plot:
        bar_pos = list(range(len(ds[1].values)))
        plt.bar(bar_pos, ds[1].values + ds[0].values,color='#86A4DA')
        plt.bar(bar_pos, ds[1].values,color='#EAAA79')
        ax2 = plt.twinx()
        x = ds.index + 0.5
        y = ds[var +'_woe'].values
        line_pos = np.arange(len(x))+0.5
        # Colour comes from the keyword only; the format string no longer
        # repeats it, which avoids defining the colour twice.
        plt.plot(line_pos, y, '.-', linewidth=2, markersize=10, color='#7AB7B5')
        plt.plot(line_pos, ds['Target_Rate'].values, '.-', linewidth=2, markersize=10, color='#8C8C8C')
        my_xticks = ds['bin'].values
        plt.xticks(x, my_xticks)
        for i,j in zip(line_pos,y):
            ax2.annotate(str(round(j,2)),xy=(i,j),va="center", ha="left",
                      bbox=dict(boxstyle="round", fc="w"))
        for i,j in zip(line_pos,ds['Target_Rate'].values):
            ax2.annotate(str(round(j,4)),xy=(i,j),va="center", ha="left",
                      bbox=dict(boxstyle="round", fc="w"))
        plt.title('Train-Sp-{}  IV:{}'.format(var, round(iv, 6)))
        plt.savefig(var_char_loc + var + '.jpeg')
        plt.close()

    # Every original category inside a merged bin maps to that bin's WOE.
    ref_table = {}
    for bin_name, woe in zip(ds['bin'].values, ds[var+'_woe'].values):
        for j in bin_name.split('_'):
            ref_table[str(j)] = str(woe)
    ref_table['base'] = 0
    cwoe_ref_table = ds[['bin', var +'_woe', 'Target_Rate', 'Cnt']].rename(columns={'bin': 'Var_Value',
                                                                               var + '_woe': 'Ref_Value'
                                                                                      }).copy()
    total = pd.DataFrame({'Var_Value': ['base'],
                          'Ref_Value': [0.0],
                          'Target_Rate': [ds[1].sum()/ (ds[0] + ds[1]).sum()],
                          'Cnt': [ds['Cnt'].sum()]
                          })

    cwoe_ref_table = pd.concat([cwoe_ref_table, total[['Var_Value','Ref_Value','Target_Rate','Cnt']]])
    print("\n  #--------------- get the reference table --------------#")    

    return ref_table,iv, cwoe_ref_table.reset_index(drop=True)


# Compute WOE / IV for categorical variables, several variables in one call
def calcu_categ_woe_iv(df_var,var_list,tgt,var_char_loc=' ',drop_na=0,to_plot=0):
    '''
    Build stacked WOE reference tables for several categorical variables.

    Every argument except ``var_list`` is forwarded unchanged to
    main_get_comb_cat_ref_table, once per variable.

    df_var: feature data, including the target column;
    var_list: list of column names to compute WOE for;
    tgt: name of the target column;
    var_char_loc: location prefix for saved charts; note that the default is a
                  single space ' ', not an empty string;
    drop_na: whether to drop missing values; default 0 (keep);
    to_plot: whether to plot; 0: no (default), 1: yes.

    Returns a pair of DataFrames, both tagged with 'Var_Name' and 'IV' and
    sorted by 'IV' in descending order:
      1. the per-category table (columns Var_Value, Ref_Value);
      2. the per-bin table returned by main_get_comb_cat_ref_table.
    Two empty DataFrames are returned when ``var_list`` is empty.
    '''
    category_tables = []
    cwoe_tables = []
    for var in var_list:
        print('\n#============ Start to process on the {} ============#'.format(var))
        ref_table,iv, cwoe_ref_table = main_get_comb_cat_ref_table(df_var,var,tgt,var_char_loc,drop_na,to_plot)
        df_ref_table_tmp = pd.DataFrame(list(ref_table.items()), columns=['Var_Value', 'Ref_Value'])
        df_ref_table_tmp['Var_Name'] = var
        df_ref_table_tmp['IV'] = iv
        cwoe_ref_table['Var_Name'] = var
        cwoe_ref_table['IV'] = iv
        category_tables.append(df_ref_table_tmp)
        cwoe_tables.append(cwoe_ref_table)
        display_html(cwoe_ref_table)

    # Nothing was processed: there is no 'IV' column to sort on.
    if not category_tables:
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frames per variable.
    df_category_ref_table = pd.concat(category_tables,axis = 0)
    df_cwoe_ref_table = pd.concat(cwoe_tables,axis = 0)
    return df_category_ref_table.sort_values('IV',ascending=False), df_cwoe_ref_table.sort_values('IV',ascending=False)



# main function: get the combined reference table for categorical variables
def main_calc_cat_ref_table(df,var,tgt,value_list,to_plot = True):
    '''
    Compute WOE for a categorical variable using caller-supplied bins.

    df: data containing ``var`` and ``tgt``; the column '<var>_comb' holding
        the bin label of every row is added to it in place;
    var: name of the categorical column;
    tgt: name of the target column; the code reads the target levels 0 and 1;
    value_list: one collection of category values per bin; bin i is labelled
                '<var>_bin_<i>'. The members are joined with '_' by bin_calc,
                so they are expected to be strings;
    to_plot: when truthy, draw count bars and the WOE line on the current
             matplotlib figure (the figure is neither saved nor closed here).

    Returns (ref_table, iv): ref_table maps str(category) -> str(woe) plus
    'base' -> 0; iv is the value returned by iv_calc.
    '''
    print(("  shape of the master table: {}".format(df.shape)))    

    var_bin_dict = {}
    for i, members in enumerate(value_list):
        var_bin_dict[var + '_' + 'bin' + '_' + str(i)] = members
    print(var_bin_dict)    
    df[var + '_comb'] = df[var].apply(lambda x: cag_woe_calc(var_bin_dict,x))   
    print((df[var + '_comb'].value_counts()))

    # Row counts per (bin, target). size() does not depend on which column
    # happens to be first in df, nor on NaNs in unrelated columns.
    ds = df.groupby([var + '_comb',tgt]).size().unstack().fillna(value=0)
    ds = ds.reset_index(drop = False)

    # population size per bin
    ds['pop'] = ds[1] + ds[0]
    ds = ds.sort_values(['pop'],ascending=True)
    ds['bin'] = ds[var + '_comb'].apply(lambda x: bin_calc(var_bin_dict,x))
    print((ds['bin'].head()))

    #------- get the reference table -------#
    # Placeholder column; it is part of the frame handed to iv_calc below.
    ds['ref_table'] = None
    goodfreq = ds[0].sum()  
    badfreq = ds[1].sum()
    ds[var +'_woe'] = ds.apply(lambda x: woe_calc(x[1],x[0],goodfreq,badfreq), axis = 1)

    if to_plot:
        bar_pos = list(range(len(ds[1].values)))
        plt.bar(bar_pos, ds[1].values + ds[0].values)
        plt.bar(bar_pos, ds[1].values,color='r')
        ax2 = plt.twinx()
        y = ds[var +'_woe'].values
        line_pos = np.arange(len(y))+0.5
        plt.plot(line_pos, y, '.-k', linewidth=2, markersize=10)
        for i,j in zip(line_pos,y):
            ax2.annotate(str(round(j,2)),xy=(i,j),va="center", ha="left",
                      bbox=dict(boxstyle="round", fc="w"))

    iv = iv_calc(ds,var +'_woe')
    print(('  IV: {}'.format(iv)))

    # Every category inside a bin maps to that bin's WOE.
    ref_table = {}
    for bin_name, woe in zip(ds['bin'].values, ds[var+'_woe'].values):
        for j in bin_name.split('_'):
            ref_table[str(j)] = str(woe)
    ref_table['base'] = 0
  
    print("\n  #--------------- get the reference table --------------#")    

    return ref_table,iv

# main function: apply the numeric reference table
def main_apply_numeric_ref_table(datain,ref_table,var):
    '''
    Add the column 'cwoe_<var>' to ``datain``, obtained by passing every
    value of ``datain[var]`` through nvlookup(ref_table, value).

    ``datain`` is modified in place and also returned.
    '''
    def _lookup(raw_value):
        return nvlookup(ref_table,raw_value)

    encoded = datain[var].apply(_lookup)
    datain['cwoe_' + var] = encoded
    return datain



# main function: apply the categorical reference table
def main_apply_cat_ref_table(datain,ref_table,var):
    '''
    Add the column 'cwoe_<var>' to ``datain``, obtained by passing the string
    form of every value of ``datain[var]`` through cvlookup(ref_table, text).

    ``datain`` is modified in place and also returned.
    '''
    def _lookup(raw_value):
        return cvlookup(ref_table,str(raw_value))

    encoded = datain[var].apply(_lookup)
    datain['cwoe_' + var] = encoded
    return datain


    
if __name__ == '__main__':
    # Script entry point: read a CSV from stdin, build the numeric WOE
    # reference table for one variable and write it to
    # '<cref_out>_<var>.csv'. The variable name, target name and output
    # prefix come from the environment variables var, tgt and cref_out.

    infile = sys.stdin
    var = os.getenv("var")
    tgt = os.getenv("tgt")
    cref_out = os.getenv("cref_out")

    # Stop with a readable message instead of failing later on None.
    missing = [name for name, value in (("var", var), ("tgt", tgt), ("cref_out", cref_out))
               if value is None]
    if missing:
        sys.exit("missing environment variable(s): " + ", ".join(missing))

    #get reference table & output    
    ins = pd.read_csv(infile)
    
    print("\n#==============================================#")
    print("#=               readin ins mst               =#") 
    print("#==============================================#")


    print(("  shape of the master table: {}".format(ins.shape)))

    
    # calculate woe
    print(('\n#============ Start to process on the {} ============#'.format(var)))
    # main_get_numeric_ref_table returns (reference table, IV) - the same
    # pair that calcu_num_woe_iv unpacks - so tag the table the same way.
    df_ref_table,iv = main_get_numeric_ref_table(ins,var,tgt,1000)
    df_ref_table['Var_Name'] = var
    df_ref_table['IV'] = iv

    df_ref_table.to_csv(cref_out + '_' + var + '.csv') 
    

